Sys.Date()
## [1] "2020-12-18"
Authors: Hanna Klimczak, Kamil Pluciński
library(plotly)
library(knitr)
library(kableExtra)
library(caret)
To ensure that running the notebook will always return the same output, we set seed to 0.
set.seed(0)
data <- read.csv("Life_Expectancy_Data.csv")
kable(head(data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| Country | Year | Status | Life.expectancy | Adult.Mortality | infant.deaths | Alcohol | percentage.expenditure | Hepatitis.B | Measles | BMI | under.five.deaths | Polio | Total.expenditure | Diphtheria | HIV.AIDS | GDP | Population | thinness..1.19.years | thinness.5.9.years | Income.composition.of.resources | Schooling |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Afghanistan | 2015 | Developing | 65.0 | 263 | 62 | 0.01 | 71.279624 | 65 | 1154 | 19.1 | 83 | 6 | 8.16 | 65 | 0.1 | 584.25921 | 33736494 | 17.2 | 17.3 | 0.479 | 10.1 |
| Afghanistan | 2014 | Developing | 59.9 | 271 | 64 | 0.01 | 73.523582 | 62 | 492 | 18.6 | 86 | 58 | 8.18 | 62 | 0.1 | 612.69651 | 327582 | 17.5 | 17.5 | 0.476 | 10.0 |
| Afghanistan | 2013 | Developing | 59.9 | 268 | 66 | 0.01 | 73.219243 | 64 | 430 | 18.1 | 89 | 62 | 8.13 | 64 | 0.1 | 631.74498 | 31731688 | 17.7 | 17.7 | 0.470 | 9.9 |
| Afghanistan | 2012 | Developing | 59.5 | 272 | 69 | 0.01 | 78.184215 | 67 | 2787 | 17.6 | 93 | 67 | 8.52 | 67 | 0.1 | 669.95900 | 3696958 | 17.9 | 18.0 | 0.463 | 9.8 |
| Afghanistan | 2011 | Developing | 59.2 | 275 | 71 | 0.01 | 7.097109 | 68 | 3013 | 17.2 | 97 | 68 | 7.87 | 68 | 0.1 | 63.53723 | 2978599 | 18.2 | 18.2 | 0.454 | 9.5 |
| Afghanistan | 2010 | Developing | 58.8 | 279 | 74 | 0.01 | 79.679367 | 66 | 1989 | 16.7 | 102 | 66 | 9.20 | 66 | 0.1 | 553.32894 | 2883167 | 18.4 | 18.4 | 0.448 | 9.2 |
nrow(data)
## [1] 2938
summary(data)
## Country Year Status Life.expectancy
## Length:2938 Min. :2000 Length:2938 Min. :36.30
## Class :character 1st Qu.:2004 Class :character 1st Qu.:63.10
## Mode :character Median :2008 Mode :character Median :72.10
## Mean :2008 Mean :69.22
## 3rd Qu.:2012 3rd Qu.:75.70
## Max. :2015 Max. :89.00
## NA's :10
## Adult.Mortality infant.deaths Alcohol percentage.expenditure
## Min. : 1.0 Min. : 0.0 Min. : 0.0100 Min. : 0.000
## 1st Qu.: 74.0 1st Qu.: 0.0 1st Qu.: 0.8775 1st Qu.: 4.685
## Median :144.0 Median : 3.0 Median : 3.7550 Median : 64.913
## Mean :164.8 Mean : 30.3 Mean : 4.6029 Mean : 738.251
## 3rd Qu.:228.0 3rd Qu.: 22.0 3rd Qu.: 7.7025 3rd Qu.: 441.534
## Max. :723.0 Max. :1800.0 Max. :17.8700 Max. :19479.912
## NA's :10 NA's :194
## Hepatitis.B Measles BMI under.five.deaths
## Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00
## 1st Qu.:77.00 1st Qu.: 0.0 1st Qu.:19.30 1st Qu.: 0.00
## Median :92.00 Median : 17.0 Median :43.50 Median : 4.00
## Mean :80.94 Mean : 2419.6 Mean :38.32 Mean : 42.04
## 3rd Qu.:97.00 3rd Qu.: 360.2 3rd Qu.:56.20 3rd Qu.: 28.00
## Max. :99.00 Max. :212183.0 Max. :87.30 Max. :2500.00
## NA's :553 NA's :34
## Polio Total.expenditure Diphtheria HIV.AIDS
## Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100
## 1st Qu.:78.00 1st Qu.: 4.260 1st Qu.:78.00 1st Qu.: 0.100
## Median :93.00 Median : 5.755 Median :93.00 Median : 0.100
## Mean :82.55 Mean : 5.938 Mean :82.32 Mean : 1.742
## 3rd Qu.:97.00 3rd Qu.: 7.492 3rd Qu.:97.00 3rd Qu.: 0.800
## Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600
## NA's :19 NA's :226 NA's :19
## GDP Population thinness..1.19.years
## Min. : 1.68 Min. :3.400e+01 Min. : 0.10
## 1st Qu.: 463.94 1st Qu.:1.958e+05 1st Qu.: 1.60
## Median : 1766.95 Median :1.387e+06 Median : 3.30
## Mean : 7483.16 Mean :1.275e+07 Mean : 4.84
## 3rd Qu.: 5910.81 3rd Qu.:7.420e+06 3rd Qu.: 7.20
## Max. :119172.74 Max. :1.294e+09 Max. :27.70
## NA's :448 NA's :652 NA's :34
## thinness.5.9.years Income.composition.of.resources Schooling
## Min. : 0.10 Min. :0.0000 Min. : 0.00
## 1st Qu.: 1.50 1st Qu.:0.4930 1st Qu.:10.10
## Median : 3.30 Median :0.6770 Median :12.30
## Mean : 4.87 Mean :0.6276 Mean :11.99
## 3rd Qu.: 7.20 3rd Qu.:0.7790 3rd Qu.:14.30
## Max. :28.60 Max. :0.9480 Max. :20.70
## NA's :34 NA's :167 NA's :163
As we can see from the summary above, there are some missing values in the dataset. Due to the fact that life.expectancy is the most important attribute for our analysis, we have decided to remove all rows where life.expectancy is NA.
data_new <- data[complete.cases(data[ , 'Life.expectancy']),]
After this operation, we still encounter missing values in the following columns: “Alcohol”, “Hepatitis.B”, “BMI”, “Polio”, “Total.expenditure”, “Diphtheria”, “GDP”, “Population”, “thinness..1.19.years”, “thinness.5.9.years”, “Income.composition.of.resources”, “Schooling”. We will fill these NA values with median for each column.
na_columns <- list("Alcohol", "Hepatitis.B", "BMI", "Polio", "Total.expenditure", "Diphtheria", "GDP", "Population", "thinness..1.19.years", "thinness.5.9.years", "Income.composition.of.resources", "Schooling")
for (col in na_columns){
m <- median(data_new[ , col], na.rm=TRUE)
print(col)
print(m)
data_new[ , col][is.na(data_new[ , col])] <- m
}
## [1] "Alcohol"
## [1] 3.77
## [1] "Hepatitis.B"
## [1] 92
## [1] "BMI"
## [1] 43.35
## [1] "Polio"
## [1] 93
## [1] "Total.expenditure"
## [1] 5.75
## [1] "Diphtheria"
## [1] 93
## [1] "GDP"
## [1] 1764.974
## [1] "Population"
## [1] 1391757
## [1] "thinness..1.19.years"
## [1] 3.3
## [1] "thinness.5.9.years"
## [1] 3.4
## [1] "Income.composition.of.resources"
## [1] 0.677
## [1] "Schooling"
## [1] 12.3
summary(data_new)
## Country Year Status Life.expectancy
## Length:2928 Min. :2000 Length:2928 Min. :36.30
## Class :character 1st Qu.:2004 Class :character 1st Qu.:63.10
## Mode :character Median :2008 Mode :character Median :72.10
## Mean :2008 Mean :69.22
## 3rd Qu.:2011 3rd Qu.:75.70
## Max. :2015 Max. :89.00
## Adult.Mortality infant.deaths Alcohol percentage.expenditure
## Min. : 1.0 Min. : 0.00 Min. : 0.010 Min. : 0.000
## 1st Qu.: 74.0 1st Qu.: 0.00 1st Qu.: 1.107 1st Qu.: 4.854
## Median :144.0 Median : 3.00 Median : 3.770 Median : 65.611
## Mean :164.8 Mean : 30.41 Mean : 4.559 Mean : 740.321
## 3rd Qu.:228.0 3rd Qu.: 22.00 3rd Qu.: 7.400 3rd Qu.: 442.614
## Max. :723.0 Max. :1800.00 Max. :17.870 Max. :19479.912
## Hepatitis.B Measles BMI under.five.deaths
## Min. : 1.00 Min. : 0.0 Min. : 1.00 Min. : 0.00
## 1st Qu.:82.00 1st Qu.: 0.0 1st Qu.:19.40 1st Qu.: 0.00
## Median :92.00 Median : 17.0 Median :43.35 Median : 4.00
## Mean :83.05 Mean : 2427.9 Mean :38.29 Mean : 42.18
## 3rd Qu.:96.00 3rd Qu.: 362.2 3rd Qu.:56.10 3rd Qu.: 28.00
## Max. :99.00 Max. :212183.0 Max. :77.60 Max. :2500.00
## Polio Total.expenditure Diphtheria HIV.AIDS
## Min. : 3.00 Min. : 0.370 Min. : 2.00 Min. : 0.100
## 1st Qu.:78.00 1st Qu.: 4.370 1st Qu.:78.00 1st Qu.: 0.100
## Median :93.00 Median : 5.750 Median :93.00 Median : 0.100
## Mean :82.62 Mean : 5.916 Mean :82.39 Mean : 1.748
## 3rd Qu.:97.00 3rd Qu.: 7.330 3rd Qu.:97.00 3rd Qu.: 0.800
## Max. :99.00 Max. :17.600 Max. :99.00 Max. :50.600
## GDP Population thinness..1.19.years
## Min. : 1.68 Min. :3.400e+01 Min. : 0.100
## 1st Qu.: 578.80 1st Qu.:4.181e+05 1st Qu.: 1.600
## Median : 1764.97 Median :1.392e+06 Median : 3.300
## Mean : 6627.39 Mean :1.026e+07 Mean : 4.834
## 3rd Qu.: 4793.63 3rd Qu.:4.593e+06 3rd Qu.: 7.100
## Max. :119172.74 Max. :1.294e+09 Max. :27.700
## thinness.5.9.years Income.composition.of.resources Schooling
## Min. : 0.100 Min. :0.0000 Min. : 0.00
## 1st Qu.: 1.600 1st Qu.:0.5040 1st Qu.:10.30
## Median : 3.400 Median :0.6770 Median :12.30
## Mean : 4.865 Mean :0.6301 Mean :12.02
## 3rd Qu.: 7.200 3rd Qu.:0.7730 3rd Qu.:14.10
## Max. :28.600 Max. :0.9480 Max. :20.70
As we can see, we have successfully dealt with missing values.
names(data_new)
## [1] "Country" "Year"
## [3] "Status" "Life.expectancy"
## [5] "Adult.Mortality" "infant.deaths"
## [7] "Alcohol" "percentage.expenditure"
## [9] "Hepatitis.B" "Measles"
## [11] "BMI" "under.five.deaths"
## [13] "Polio" "Total.expenditure"
## [15] "Diphtheria" "HIV.AIDS"
## [17] "GDP" "Population"
## [19] "thinness..1.19.years" "thinness.5.9.years"
## [21] "Income.composition.of.resources" "Schooling"
quantitive_cols = c("Year", "Life.expectancy", "Adult.Mortality", "infant.deaths", "Alcohol", "percentage.expenditure", "Hepatitis.B", "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure", "Diphtheria","HIV.AIDS","GDP","Population", "thinness..1.19.years","thinness.5.9.years", "Income.composition.of.resources", "Schooling")
l <- htmltools::tagList()
i <- 1
for (col in quantitive_cols){
l[[i]] <- plot_ly(y = data_new[, col], type = "box",name=col, quartilemethod="exclusive")
i <- i + 1
}
l
p1 <- plot_ly(data_new, x = ~Status) %>%
add_histogram()
p1
correlation_data <- data_new[quantitive_cols]
fig <- plot_ly(
x = quantitive_cols,
y = quantitive_cols,
z = cor(correlation_data), type = "heatmap"
)
fig
The above chart gives us important information about the correlation between variables. As we can see, there is almost perfect correlation between thinness.1.19.years and thinness.5.9.years. There also appears to be a very strong correlation between GDP and percentage.expenditure (0.9), as well as infant.deaths and under.five.deaths (0.99). Naturally, we can also see a strong negative correlation between Adult.Mortality and Life.expectancy (-0.69). Schooling and life expectancy seem to be slightly correlated as well (0.71).
For our prediction, we will need to drop some of the features that are highly correlated. We will drop thinness.5.9.years, percentage.expenditure and infant.deaths, as their correlation to our decision variable is weaker than features correlated to them.
fig <- plot_ly(data, x = data_new[, 'Year'], y = data_new[, 'Life.expectancy'], name = ~data_new[, 'Country'], type = 'scatter', mode = 'lines', color=~data_new[, 'Country'])
fig <- fig %>% layout(legend = list(orientation = 'h'))
fig
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Data preparation
regression_data <- data_new[, !names(data_new) %in% c("Country", "thinness.5.9.years", "percentage.expenditure", "infant.deaths")]
kable(head(regression_data), "html") %>% kable_styling("striped") %>% scroll_box(width = "100%")
| Year | Status | Life.expectancy | Adult.Mortality | Alcohol | Hepatitis.B | Measles | BMI | under.five.deaths | Polio | Total.expenditure | Diphtheria | HIV.AIDS | GDP | Population | thinness..1.19.years | Income.composition.of.resources | Schooling |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2015 | Developing | 65.0 | 263 | 0.01 | 65 | 1154 | 19.1 | 83 | 6 | 8.16 | 65 | 0.1 | 584.25921 | 33736494 | 17.2 | 0.479 | 10.1 |
| 2014 | Developing | 59.9 | 271 | 0.01 | 62 | 492 | 18.6 | 86 | 58 | 8.18 | 62 | 0.1 | 612.69651 | 327582 | 17.5 | 0.476 | 10.0 |
| 2013 | Developing | 59.9 | 268 | 0.01 | 64 | 430 | 18.1 | 89 | 62 | 8.13 | 64 | 0.1 | 631.74498 | 31731688 | 17.7 | 0.470 | 9.9 |
| 2012 | Developing | 59.5 | 272 | 0.01 | 67 | 2787 | 17.6 | 93 | 67 | 8.52 | 67 | 0.1 | 669.95900 | 3696958 | 17.9 | 0.463 | 9.8 |
| 2011 | Developing | 59.2 | 275 | 0.01 | 68 | 3013 | 17.2 | 97 | 68 | 7.87 | 68 | 0.1 | 63.53723 | 2978599 | 18.2 | 0.454 | 9.5 |
| 2010 | Developing | 58.8 | 279 | 0.01 | 66 | 1989 | 16.7 | 102 | 66 | 9.20 | 66 | 0.1 | 553.32894 | 2883167 | 18.4 | 0.448 | 9.2 |
Train test valid split
trainval_partition <-
createDataPartition(
y = regression_data$Life.expectancy,
p = .8,
list = FALSE)
trainval_data <- regression_data[ trainval_partition,]
test_data <- regression_data[-trainval_partition,]
train_partition <-
createDataPartition(
y = trainval_data$Life.expectancy,
p = .8,
list = FALSE)
train_data <- trainval_data[ train_partition,]
val_data <- trainval_data[-train_partition,]
control <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 5)
linear <- train(Life.expectancy ~ .,
data = train_data,
trControl = control,
method = "lm")
linear
## Linear Regression
##
## 1878 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1690, 1690, 1690, 1690, 1690, 1690, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 4.232244 0.8023078 3.161511
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
lasso <- train(Life.expectancy ~ .,
data = train_data,
trControl = control,
method = "lasso")
lasso
## The lasso
##
## 1878 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1690, 1690, 1689, 1690, 1688, 1691, ...
## Resampling results across tuning parameters:
##
## fraction RMSE Rsquared MAE
## 0.1 8.521102 0.6982996 6.929972
## 0.5 5.459212 0.7575138 4.110708
## 0.9 4.246008 0.8012071 3.155828
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was fraction = 0.9.
ridge <- train(Life.expectancy ~ .,
data = train_data,
trControl = control,
method = "ridge")
ridge
## Ridge Regression
##
## 1878 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 1689, 1691, 1691, 1689, 1689, 1690, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared MAE
## 0e+00 4.227943 0.8031544 3.162105
## 1e-04 4.227943 0.8031550 3.162129
## 1e-01 4.275968 0.8029778 3.231954
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was lambda = 1e-04.
ggplot(varImp(lasso))